This notebook explores the use of SingleR
to perform cell-type annotation on datasets from the ScPCA project
SCPCP000007 (Gawad lab data).
Note: The first time you run this code it may take a few more minutes due to reference downloads, but they will be cached for faster future execution.
Use this code to obtain the PBMC reference from Zenodo: https://zenodo.org/record/4546839#.Y3PZ5oLMJhE
wget https://zenodo.org/record/4546839/files/ref.Rds
# load the R project
project_root <- here::here()
renv::load(project_root)
* Project '~/ALSF/scpca/sc-data-integration' loaded. [renv 0.15.5]
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
suppressPackageStartupMessages({
library(SingleCellExperiment)
library(SingleR)
library(celldex)
library(ggplot2)
})
theme_set(theme_bw())
set.seed(params$seed) # unclear if this is doing anything? probably not. but maybe later!
utils_dir <- file.path(project_root, "scripts", "utils")
source(file.path(utils_dir, "integration-helpers.R"))
sce_file_suffix <- "processed_citeseq.rds"
integrated_sce_file <- file.path(project_root,
params$integrated_sce_dir,
paste0(params$project_id, "_integrated_", params$integration_method, "_sce.rds")
)
if (!file.exists(integrated_sce_file)){
stop("Integrated SCE file could not be found.")
}
Read in the data:
library_metadata_df <- readr::read_tsv(file.path(project_root, params$library_file))
Rows: 25 Columns: 16── Column specification ───────────────────────────────────────────
Delimiter: "\t"
chr (15): project_name, submitter, library_biomaterial_id, samp...
lgl (1): has_CITEseq
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
integrated_sce <- readr::read_rds(integrated_sce_file)
# Define the unintegrated SCE filenames
sce_file_paths <- library_metadata_df %>%
dplyr::filter(project_name == params$project_id) %>%
dplyr::mutate(sce_file_path = file.path(
project_root,
integration_input_dir,
sample_biomaterial_id,
glue::glue("{library_biomaterial_id}_{sce_file_suffix}")
)) %>%
dplyr::pull(sce_file_path)
SingleR
annotationHere we perform celltype annotation with the given reference in
params$reference on each of the Gawad libraries and look at
their UMAPs colored by celltype.
First, set the reference:
if (params$reference == "hpca") {
ref_data <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)
} else if (params$reference == "blueprint_encode") {
ref_data <- celldex::BlueprintEncodeData(ensembl = TRUE)
} else {
stop("Bad reference parameter; either 'hpca' or 'blueprint_encode'.")
}
snapshotDate(): 2022-04-26
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
snapshotDate(): 2022-04-25
loading from cache
require(“ensembldb”)
Warning: Unable to map 1470 of 19363 requested IDs.
Define some functions:
annotate_SingleR <- function(sce, ref_data, label_name = "label.main") {
# label_name is either "label.main" or "label.fine" to label with reference
# broad or fine-grained celltypes, respectively
#label_name <- "label.fine"
# return updated sce and the predictions themselves
preds <- SingleR::SingleR(
test = sce,
ref = ref_data,
labels = colData(ref_data)[,label_name],
BPPARAM = BiocParallel::MulticoreParam(4)
)
# Add `pruned.labels`, where low-confidence annotations are NA, to sce
sce$SingleR_annotations <- preds$pruned.labels
return(
list(
sce = sce,
preds = preds
)
)
}
plot_SingleR <- function(annotation_output, colname) {
# annotation_output: list of sce and preds
# colname: column name to plot
# Make a heatmap and UMAP and print them out
heatmap <- SingleR::plotScoreHeatmap(annotation_output[["preds"]],
# default but let's be explicit
show.pruned = FALSE)
# In case, for `levels()` below.
colData(annotation_output[["sce"]])[,colname] <- as.factor(colData(annotation_output[["sce"]])[,colname])
umap_df <- tibble::as_tibble(reducedDim(annotation_output[["sce"]], "UMAP")) %>%
dplyr::select(UMAP1 = V1, UMAP2 = V2) %>%
dplyr::mutate(celltypes = colData(annotation_output[["sce"]])[,colname])
plot_colors <- rainbow( length(levels(umap_df$celltypes)) )
umap <- ggplot(umap_df) +
aes(x = UMAP1, y = UMAP2, color = celltypes) +
geom_point(size = 0.2, alpha = 0.5) +
scale_color_manual(values = plot_colors) +
guides(color = guide_legend(override.aes = list(size=3))) +
ggtitle(glue::glue("UMAP with top {params$max_celltypes} celltypes shown"))
print(heatmap)
print(umap)
}
# Function to run and plot SingleR
# Return SCE with annotations `SingleR_annotations` column
run_SingleR <- function(sce,
max_celltypes = params$max_celltypes,
viz = TRUE) {
# Print out the library
print(unique( metadata(sce)$library ))
# Run annotations
anno <- annotate_SingleR(sce, ref_data)
# Create an additional column `SingleR_annotations_collapsed` with only max_celltypes
anno[["sce"]]$SingleR_annotations_collapsed <- forcats::fct_lump_n(
anno[["sce"]]$SingleR_annotations,
max_celltypes
)
# plot if TRUE
if (viz) {
# there is no interesting color palette harmonization among library colors here!
plot_SingleR(anno, "SingleR_annotations_collapsed")
}
return(anno[["sce"]])
}
Here we go!
# Read in all SCE files
sce_list <- purrr::map(
sce_file_paths,
readr::read_rds
)
Warning: call dbDisconnect() when finished working with a connection
# Annotate them all, popping out some viz along the way if specified
sce_list_annotated <- purrr::map(sce_list, run_SingleR, viz = TRUE)
[1] "SCPCL000295"
[1] "SCPCL000296"
[1] "SCPCL000297"
[1] "SCPCL000298"
[1] "SCPCL000299"
This section has some UMAPs:
CD123 (leukemia marker) and CD3 (T-cell
marker)# First let's create a df of celltypes and ADTs:
extract_celltypes_adts <- function(sce) {
# data frame of ADT counts
adt_df <- logcounts(altExp(sce)) %>%
as.matrix() %>%
t() %>%
tibble::as_tibble(rownames = "cell_barcode") %>%
tidyr::pivot_longer(dplyr::starts_with("CD"),
names_to = "ADT",
values_to = "logcounts")
# Combine with data frame of annotations
tibble::tibble(celltype = sce$SingleR_annotations,
cell_barcode = rownames(colData(sce)),
library = metadata(sce)$library) %>%
dplyr::inner_join(adt_df) %>%
dplyr::mutate(cell_name = paste(cell_barcode, library, sep = "-"))
}
celltype_adt_df <- purrr::map_df(sce_list_annotated, extract_celltypes_adts)
Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"
# integrated UMAP with individual library cell annotations colored
umap_df <- celltype_adt_df %>%
dplyr::select(celltype, cell_name, library) %>%
dplyr::distinct() %>%
dplyr::bind_cols(
as.data.frame(reducedDim(integrated_sce, paste0(params$integration_method, "_UMAP")))
) %>%
dplyr::mutate(UMAP1 = V1,
UMAP2 = V2,
celltype = forcats::fct_lump_n(celltype, params$max_celltypes))
# And let's make a UMAP!
rna_umap <- ggplot(umap_df) +
aes(x = UMAP1, y = UMAP2, color = celltype) +
geom_point(size = 0.3, alpha = 0.4) +
guides(color = guide_legend(override.aes = list(size=3))) +
ggtitle(glue::glue("Integrated UMAP with top {params$max_celltypes} SingleR-annotated celltypes shown"))
rna_umap
rna_umap +
lims(
x = c(-6, 4),
y = c(-5, 5.5)
) +
ggtitle("The above UMAP, zoomed in")
# function to make UMAPs colored by ADT
make_adt_umap <- function(celltype_adt_df, adt_name) {
umap_df <- celltype_adt_df %>%
dplyr::select(cell_name, ADT, logcounts) %>%
dplyr::filter(ADT %in% adt_name) %>%
dplyr::bind_cols(
as.data.frame(reducedDim(integrated_sce, paste0(params$integration_method, "_UMAP")))
) %>%
dplyr::rename(UMAP1 = V1,
UMAP2 = V2)
ggplot(umap_df) +
aes(x = UMAP1, y = UMAP2, color = logcounts) +
geom_point(size = 0.3, alpha = 0.4) +
scale_color_viridis_c() +
labs(
title = glue::glue("{adt_name} expression"),
color = "Normalized expression")
}
make_adt_umap(celltype_adt_df, "CD3") # bottom left corner tiny dots!
make_adt_umap(celltype_adt_df, "CD123") # everything is tumor
make_adt_umap(celltype_adt_df, "CD45") # should be on all differentiated cells more or less\
make_adt_umap(celltype_adt_df, "CD19") # b cells
To build a sense of how different references perform, let’s just look at one SCE (arbitrarily chosen as the first):
sce <- sce_list[[1]]
Functions for comparing references:
# Return a tibble of predictions from the two references
# Define outside of function to avoid rerunning over and over and over AND OVER.
hpca_ref <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)
snapshotDate(): 2022-04-26
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
snapshotDate(): 2022-04-25
loading from cache
Warning: Unable to map 1470 of 19363 requested IDs.
blueprint_encode_ref <- celldex::BlueprintEncodeData(ensembl = TRUE)
snapshotDate(): 2022-04-26
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
snapshotDate(): 2022-04-25
loading from cache
Warning: Unable to map 532 of 19859 requested IDs.
compare_predictions <- function(sce,
label_name) {
# HPCA prediction
preds_hpca <- SingleR::SingleR(
test = sce,
ref = hpca_ref,
labels = colData(hpca_ref)[,label_name],
BPPARAM = BiocParallel::MulticoreParam(4)
)
# Blueprint/ENCODE prediction
preds_be <- SingleR::SingleR(
test = sce,
ref = blueprint_encode_ref,
labels = colData(blueprint_encode_ref)[,label_name],
BPPARAM = BiocParallel::MulticoreParam(4)
)
# Combine results into a wide tibble:
preds_df <- tibble::tibble(
hpca = preds_hpca$labels,
cell_barcode = rownames(preds_hpca)) %>%
dplyr::inner_join(
tibble::tibble(
blueprint_encode = preds_be$labels,
cell_barcode = rownames(preds_be))
)
return(preds_df)
}
The celldex
package contains bulk RNA-Seq datasets for use as reference. These two
are likely most useful:
The HPCA reference consists of publicly available microarray datasets derived from human primary cells (Mabbott et al. 2013). Most of the labels refer to blood subpopulations but cell types from other tissues are also available.
The Blueprint/ENCODE reference consists of bulk RNA-seq data for pure stroma and immune cells generated by Blueprint (Martens and Stunnenberg 2013) and ENCODE projects (The ENCODE Project Consortium 2012).
main_labels_df <- compare_predictions(sce, "label.main")
Joining, by = "cell_barcode"
fine_labels_df <- compare_predictions(sce, "label.fine")
Joining, by = "cell_barcode"
How do these predictions compare? Note that these references often use different labels for the same cell type so we shouldn’t expect a perfect diagonal below:
#function for making comparison heatmaps
make_comparison_heatmap <- function(df) {
df %>%
dplyr::mutate(
hpca = forcats::fct_rev(forcats::fct_infreq(hpca)),
blueprint_encode = forcats::fct_rev(forcats::fct_infreq(blueprint_encode))
) %>%
dplyr::count(hpca, blueprint_encode) %>%
ggplot() +
aes(x = blueprint_encode, y = hpca, fill = n) +
geom_tile(color = "black") +
geom_text(aes(label = n)) +
scale_fill_distiller(palette = "RdYlBu", name = "Num. cells") +
# background grid from theme_bw does not match up with rectangles.
theme_classic() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
}
make_comparison_heatmap(main_labels_df) + ggtitle("Broad label comparison")
make_comparison_heatmap(fine_labels_df) + ggtitle("Fine label comparison")
sessionInfo()
R version 4.2.1 (2022-06-23)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Monterey 12.6.1
Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats4 stats graphics grDevices datasets utils
[7] methods base
other attached packages:
[1] ensembldb_2.20.2 AnnotationFilter_1.20.0
[3] GenomicFeatures_1.48.3 AnnotationDbi_1.58.0
[5] magrittr_2.0.3 ggplot2_3.3.6
[7] celldex_1.6.0 SingleR_1.10.0
[9] SingleCellExperiment_1.18.0 SummarizedExperiment_1.26.1
[11] Biobase_2.56.0 GenomicRanges_1.48.0
[13] GenomeInfoDb_1.32.3 IRanges_2.30.0
[15] S4Vectors_0.34.0 BiocGenerics_0.42.0
[17] MatrixGenerics_1.8.1 matrixStats_0.62.0
loaded via a namespace (and not attached):
[1] colorspace_2.0-3 rjson_0.2.21
[3] ellipsis_0.3.2 modeltools_0.2-23
[5] rprojroot_2.0.3 XVector_0.36.0
[7] BiocNeighbors_1.14.0 rstudioapi_0.13
[9] farver_2.1.1 flexmix_2.3-18
[11] bit64_4.0.5 interactiveDisplayBase_1.34.0
[13] fansi_1.0.3 xml2_1.3.3
[15] codetools_0.2-18 splines_4.2.1
[17] sparseMatrixStats_1.8.0 cachem_1.0.6
[19] knitr_1.39 Rsamtools_2.12.0
[21] dbplyr_2.2.1 png_0.1-7
[23] pheatmap_1.0.12 shiny_1.7.2
[25] BiocManager_1.30.18 readr_2.1.2
[27] compiler_4.2.1 httr_1.4.3
[29] lazyeval_0.2.2 assertthat_0.2.1
[31] Matrix_1.4-1 fastmap_1.1.0
[33] cli_3.3.0 later_1.3.0
[35] BiocSingular_1.12.0 miQC_1.4.0
[37] htmltools_0.5.3 prettyunits_1.1.1
[39] tools_4.2.1 rsvd_1.0.5
[41] gtable_0.3.0 glue_1.6.2
[43] GenomeInfoDbData_1.2.8 dplyr_1.0.9
[45] rappdirs_0.3.3 Rcpp_1.0.9
[47] vctrs_0.4.1 Biostrings_2.64.0
[49] rtracklayer_1.56.1 ExperimentHub_2.4.0
[51] DelayedMatrixStats_1.18.0 xfun_0.32
[53] stringr_1.4.0 beachmat_2.12.0
[55] mime_0.12 lifecycle_1.0.1
[57] irlba_2.3.5 restfulr_0.0.15
[59] renv_0.15.5 XML_3.99-0.10
[61] AnnotationHub_3.4.0 zlibbioc_1.42.0
[63] scales_1.2.0 vroom_1.5.7
[65] ProtGenerics_1.28.0 hms_1.1.1
[67] promises_1.2.0.1 parallel_4.2.1
[69] RColorBrewer_1.1-3 yaml_2.3.5
[71] curl_4.3.2 gridExtra_2.3
[73] memoise_2.0.1 biomaRt_2.52.0
[75] stringi_1.7.8 RSQLite_2.2.15
[77] BiocVersion_3.15.2 BiocIO_1.6.0
[79] ScaledMatrix_1.4.0 filelock_1.0.2
[81] BiocParallel_1.30.3 rlang_1.0.4
[83] pkgconfig_2.0.3 bitops_1.0-7
[85] evaluate_0.16 lattice_0.20-45
[87] purrr_0.3.4 labeling_0.4.2
[89] GenomicAlignments_1.32.1 bit_4.0.4
[91] tidyselect_1.1.2 here_1.0.1
[93] R6_2.5.1 generics_0.1.3
[95] DelayedArray_0.22.0 DBI_1.1.3
[97] pillar_1.8.0 withr_2.5.0
[99] KEGGREST_1.36.3 RCurl_1.98-1.8
[101] nnet_7.3-17 tibble_3.1.8
[103] crayon_1.5.1 utf8_1.2.2
[105] BiocFileCache_2.4.0 tzdb_0.3.0
[107] rmarkdown_2.14 viridis_0.6.2
[109] progress_1.2.2 grid_4.2.1
[111] blob_1.2.3 forcats_0.5.2
[113] digest_0.6.29 xtable_1.8-4
[115] tidyr_1.2.0 httpuv_1.6.5
[117] munsell_0.5.0 viridisLite_0.4.0